Introduction

This report performs Principal Component Analysis (PCA) on TB samples combined with 1000 Genomes data to visualize population structure and identify significant principal components.

Load Libraries

library(openxlsx)
library(ggplot2)
library(ggrepel)
library(scatterplot3d)
library(data.table)
library(reshape)

Part 1: PCA with 1000 Genomes Reference

Load PCA Data and Metadata

# Read in the eigenvectors produced in PLINK
pca <- read.table(params$PCA_1KG_EIGENVEC, 
                  header = FALSE, skip = 0, sep = ' ')
rownames(pca) <- pca[,2]
pca <- pca[,3:ncol(pca)]
colnames(pca) <- paste('PC', c(1:20), sep = '')

# Read in the PED data
PED <- read.xlsx(params$SAMPLE_SHEET, sheet = params$SHEET_NUM)
PED <- PED[which(PED$Individual_ID %in% rownames(pca)), ]
PED <- PED[match(rownames(pca), PED$Individual_ID),]
cat("Sample IDs match:", all(PED$Individual_ID == rownames(pca)), "\n")
## Sample IDs match: TRUE

Prepare Data for Visualization

# Merge PCA results with metadata
pca$sample.name = rownames(pca)
plot <- merge(pca, PED, by.x = "sample.name", by.y = "Individual_ID")

# Create super population categories
plot$super.population <- ifelse(plot$Population %in% c("ACB","ASW","ESN","GWD","LWK","MSL","YRI"), "AFR",
                                ifelse(plot$Population %in% c("CLM","MXL","PEL","PUR"), "AMR",
                                       ifelse(plot$Population %in% c("CDX","CHB","CHS","JPT","KHV"), "EAS",
                                              ifelse(plot$Population %in% c("CEU","FIN","GBR","IBS","TSI"), "EUR",
                                                     ifelse(plot$Population %in% c("BEB","GIH","ITU","PJL","STU"), "SAS", "HIRV TB")))))

cat("Super population distribution:\n")
## Super population distribution:
print(table(plot$super.population))
## 
##     AFR     AMR     EAS     EUR HIRV TB     SAS 
##     661     347     504     503     267     489
# Calculate percentage variance explained
eigenval <- scan(params$PCA_1KG_EIGENVAL)
percentVar <- data.frame(PC = 1:20, pve = sprintf("%.1f", eigenval/sum(eigenval)*100))

# Set factor levels
plot$super.population = factor(plot$super.population, 
                               levels = c("AFR", "AMR", "EAS", "EUR", "SAS", "HIRV TB"))

2D PCA visualisation

# PC1 vs PC2
ggplot(plot, aes(x = PC1, y = PC2, color = super.population)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(title = "PCA: TB Samples with 1000 Genomes Reference",
       x = paste0("PC1 (", percentVar$pve[1], "%)"),
       y = paste0("PC2 (", percentVar$pve[2], "%)"),
       color = "Super Population") +
  theme_minimal() +
  theme(legend.position = "bottom")

# PC3 vs PC4
ggplot(plot, aes(x = PC3, y = PC4, color = super.population)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(title = "PCA: PC3 vs PC4",
       x = paste0("PC3 (", percentVar$pve[3], "%)"),
       y = paste0("PC4 (", percentVar$pve[4], "%)"),
       color = "Super Population") +
  theme_minimal() +
  theme(legend.position = "bottom")

3D PCA visualisation

# 3D PCA for HIRV TB samples only
plot2 = plot[plot$super.population %in% "HIRV TB",]
colors <- c("#999999", "#E69F00", "#56B4E9", "red", "purple")
colors <- colors[as.numeric(as.factor(plot2$Population))]

s3d <- scatterplot3d(plot2$PC1, plot2$PC2, plot2$PC3,
                    xlab = "PC1", ylab = "PC2", zlab = "PC3", 
                    pch = 16, color = colors, main = "3D PCA: HIRV TB Samples")
legend("right", legend = levels(as.factor(plot2$Population)),
       col = c("#999999", "#E69F00", "#56B4E9", "red", "purple"), pch = 16)

Part 2: PCA with TB samples only

Load TB-only PCA Data

# Read in eigenvectors for TB samples only
pca_tb <- read.table(params$PCA_TB_EIGENVEC, 
                     header = FALSE, skip = 0, sep = ' ')
rownames(pca_tb) <- pca_tb[,2]
pca_tb <- pca_tb[,3:ncol(pca_tb)]
colnames(pca_tb) <- paste('PC', c(1:20), sep = '')

# Merge with metadata
pca_tb$sample.name = rownames(pca_tb)
plot_tb <- merge(pca_tb, PED, by.x = "sample.name", by.y = "Individual_ID")

cat("Population distribution in TB samples:\n")
## Population distribution in TB samples:
print(table(plot_tb$Population))
## 
## AFR AMR EAS EUR SAS 
##  74   9  44  85  55
# Calculate percentage variance explained for TB-only data
eigenval_tb <- scan(params$PCA_TB_EIGENVAL)
percentVar_tb <- data.frame(PC = 1:20, pve = sprintf("%.1f", eigenval_tb/sum(eigenval_tb)*100))

TB-only PCA visualisation

# 2D PCA for TB samples
ggplot(plot_tb, aes(x = PC1, y = PC2, color = Population)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "PCA: TB Samples Only",
       x = paste0("PC1 (", percentVar_tb$pve[1], "%)"),
       y = paste0("PC2 (", percentVar_tb$pve[2], "%)")) +
  theme_minimal() +
  theme(legend.position = "bottom")

# 3D PCA for TB samples
colors_tb <- c("#999999", "#E69F00", "#56B4E9", "red", "purple")
colors_tb <- colors_tb[as.numeric(as.factor(plot_tb$Population))]

s3d_tb <- scatterplot3d(plot_tb$PC1, plot_tb$PC2, plot_tb$PC3,
                       xlab = "PC1", ylab = "PC2", zlab = "PC3", 
                       pch = 16, color = colors_tb, 
                       main = "3D PCA: All TB Samples")
legend("right", legend = levels(as.factor(plot_tb$Population)),
       col = c("#999999", "#E69F00", "#56B4E9", "red", "purple"), pch = 16)

Part 3: determine significant PCs associated with population structure

# Test each PC for association with population
dat2 = list()
t = colnames(plot_tb[,2:21]) # 20 PCs

for (i in t) {
  formula <- as.formula(paste(i, "~ Population"))
  fit = lm(formula, data = plot_tb)
  
  dat2[[i]] = data.frame(
    beta.AMR = coef(fit)[2],
    beta.EAS = coef(fit)[3],
    beta.EUR = coef(fit)[4],
    beta.SAS = coef(fit)[5],
    p.valueAMR = coef(summary(fit))[2,4],
    p.valueEAS = coef(summary(fit))[3,4],
    p.valueEUR = coef(summary(fit))[4,4],
    p.valueASA = coef(summary(fit))[5,4],
    PC = gsub("PC", "", i)
  )
}

dat2 = as.data.frame(do.call(rbind, dat2))

# Adjust p-values for multiple testing
dat2$adjp.AMR = p.adjust(dat2$p.valueAMR, method = "fdr")
dat2$adjp.EAS = p.adjust(dat2$p.valueEAS, method = "fdr")
dat2$adjp.EUR = p.adjust(dat2$p.valueEUR, method = "fdr")
dat2$adjp.ASA = p.adjust(dat2$p.valueASA, method = "fdr")

cat("Significant PC associations found:\n")
## Significant PC associations found:
print(dat2[dat2$adjp.AMR < 0.05 | dat2$adjp.EAS < 0.05 | 
           dat2$adjp.EUR < 0.05 | dat2$adjp.ASA < 0.05, ])
##         beta.AMR     beta.EAS     beta.EUR     beta.SAS   p.valueAMR
## PC1   0.11716604  0.132404184  0.129304677  0.126388815 1.390430e-40
## PC2  -0.01118028  0.124866014 -0.056439989  0.002814405 1.491730e-02
## PC3   0.02558498  0.038908064  0.034978540 -0.117702057 1.485887e-05
## PC4  -0.02570894 -0.022955059 -0.044002112 -0.024210739 2.210186e-01
## PC5   0.23104468 -0.015369855 -0.017417395 -0.002417023 5.574076e-38
## PC6   0.08191100 -0.001768767  0.003845047 -0.009570937 1.290690e-04
## PC11 -0.05598031 -0.004919880 -0.003174231 -0.006628056 9.841831e-03
##         p.valueEAS    p.valueEUR    p.valueASA PC     adjp.AMR      adjp.EAS
## PC1   1.216213e-96 1.760596e-111  1.528568e-98  1 2.780859e-39  1.216213e-95
## PC2  1.474602e-137  4.195669e-79  2.223446e-01  2 4.972435e-02 2.949204e-136
## PC3   2.976649e-28  1.564692e-31 3.248112e-114  3 9.905913e-05  1.984432e-27
## PC4   4.324417e-02  4.992231e-06  2.277056e-02  4 2.957391e-01  2.162208e-01
## PC5   6.122508e-02  1.131540e-02  7.521601e-01  5 5.574076e-37  2.449003e-01
## PC6   8.764553e-01  6.857693e-01  3.687293e-01  6 6.453448e-04  9.770053e-01
## PC11  6.720650e-01  7.436359e-01  5.420597e-01 11 3.936732e-02  9.770053e-01
##           adjp.EUR      adjp.ASA
## PC1  3.521192e-110  1.528568e-97
## PC2   4.195669e-78  9.512302e-01
## PC3   1.043128e-30 6.496223e-113
## PC4   2.496116e-05  1.518038e-01
## PC5   4.526159e-02  9.512302e-01
## PC6   9.738379e-01  9.512302e-01
## PC11  9.738379e-01  9.512302e-01

Visualise significant PCs

# Prepare data for visualization
df = dat2[,c("adjp.AMR", "adjp.EAS", "adjp.EUR", "adjp.ASA", "PC")]
df = melt(df, id = "PC")

# Create bar plot of adjusted p-values
ggplot(df, aes(reorder(PC, as.numeric(PC)), -log10(value), fill = variable)) +
  geom_bar(stat = "identity", position = position_dodge()) +  
  theme_bw() + 
  theme(panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Principal Component", 
       y = "-Log10(Adjusted P-value)",
       title = "Association of PCs with Population Structure",
       fill = "Population") +
  geom_hline(yintercept = -log10(0.05), linetype = "dotted", color = "red")

Summary

cat("### Analysis Summary\n")
## ### Analysis Summary
cat("- Total samples in 1KG + TB analysis:", nrow(plot), "\n")
## - Total samples in 1KG + TB analysis: 2771
cat("- Total samples in TB-only analysis:", nrow(plot_tb), "\n")
## - Total samples in TB-only analysis: 267
cat("- Number of significant PC-population associations:", 
sum(dat2$adjp.EAS < 0.05 | dat2$adjp.EUR < 0.05 | dat2$adjp.ASA < 0.05), "\n")
## - Number of significant PC-population associations: 5
#cat("- PCs with ≥2 adj-p < 0.05:",
#    sum(rowSums(dat2[, c("adjp.AMR","adjp.EAS","adjp.EUR","adjp.ASA")] < 0.05, na.rm = TRUE) >= 2), "\n")

Session Information

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape_0.8.9        data.table_1.16.2    scatterplot3d_0.3-44
## [4] ggrepel_0.9.6        ggplot2_3.5.1        openxlsx_4.2.8      
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6     jsonlite_1.8.7   highr_0.10       dplyr_1.1.4     
##  [5] compiler_4.3.2   tidyselect_1.2.1 Rcpp_1.0.13-1    zip_2.3.0       
##  [9] jquerylib_0.1.4  scales_1.3.0     yaml_2.3.7       fastmap_1.1.1   
## [13] plyr_1.8.9       R6_2.5.1         labeling_0.4.3   generics_0.1.3  
## [17] knitr_1.45       tibble_3.2.1     munsell_0.5.1    bslib_0.5.1     
## [21] pillar_1.9.0     rlang_1.1.4      utf8_1.2.4       cachem_1.0.8    
## [25] stringi_1.8.4    xfun_0.41        sass_0.4.7       cli_3.6.3       
## [29] withr_3.0.2      magrittr_2.0.3   digest_0.6.33    grid_4.3.2      
## [33] lifecycle_1.0.4  vctrs_0.6.5      evaluate_0.23    glue_1.8.0      
## [37] farver_2.1.2     fansi_1.0.6      colorspace_2.1-1 rmarkdown_2.25  
## [41] tools_4.3.2      pkgconfig_2.0.3  htmltools_0.5.7